{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%run -n main.py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path = join_path(DATA_DIR, DATASET)\n",
    "# !mkdir -p {path}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for name in DATASETS:\n",
    "#     paths = (\n",
    "#         join_path(CORUS_DATA_DIR, _)\n",
    "#         for _ in CORUS_FILES[name]\n",
    "#     )\n",
    "#     records = (\n",
    "#         record\n",
    "#         for path in paths\n",
    "#         for record in load_dataset(path)\n",
    "#     )\n",
    "#     records = log_progress(records, desc=name)\n",
    "#     records = sample(records, 1000)\n",
    "\n",
    "#     path = join_path(DATA_DIR, DATASET, name + JL + GZ)\n",
    "#     items = as_jsons(records)\n",
    "#     lines = format_jl(items)\n",
    "#     dump_gz_lines(lines, path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "datasets = {}\n",
    "for name in DATASETS:\n",
    "    path = join_path(DATA_DIR, DATASET, name + JL + GZ)\n",
    "    lines = load_gz_lines(path)\n",
    "    items = parse_jl(lines)\n",
    "    datasets[name] = list(from_jsons(items, Markup))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for name in MODELS:\n",
    "#     path = join_path(DATA_DIR, name)\n",
    "#     !mkdir -p {path}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## cpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "docker = docker_client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = SPACY\n",
    "model = MODELS[model_name]()\n",
    "model.start(docker)\n",
    "model.wait()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset_name in DATASETS:\n",
    "    records = model.map(_.words for _ in datasets[dataset_name])\n",
    "    records = log_progress(records, desc=dataset_name)\n",
    "\n",
    "    path = join_path(DATA_DIR, model_name, dataset_name + JL + GZ)\n",
    "    items = as_jsons(records)\n",
    "    lines = format_jl(items)\n",
    "    dump_gz_lines(lines, path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.stop(docker)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## gpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !vast search offers | grep '1 x  GTX 1080 Ti'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model = DeeppavlovBERTModel()\n",
    "# model = SlovnetBERTModel()\n",
    "# model = StanzaModel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !vast create instance 498795 --image {model.image} --disk 30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !vast show instances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !ssh ssh4.vast.ai -p 20908 -l root -Nf -L {model.port}:localhost:{model.container_port}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for dataset_name in DATASETS:\n",
    "#     records = datasets[dataset_name]\n",
    "#     records = log_progress(records, desc=dataset_name)\n",
    "#     records = model.map(_.words for _ in records)\n",
    "\n",
    "#     path = join_path(DATA_DIR, model.name, dataset_name + JL + GZ)\n",
    "#     items = as_jsons(records)\n",
    "#     lines = format_jl(items)\n",
    "#     dump_gz_lines(lines, path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !vast destroy instance 500908"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_models = {}\n",
    "for dataset in DATASETS:\n",
    "    for model in MODELS:\n",
    "        path = join_path(DATA_DIR, model, dataset + JL + GZ)\n",
    "        lines = load_gz_lines(path)\n",
    "        items = parse_jl(lines)\n",
    "        dataset_models[dataset, model] = list(from_jsons(items, Markup))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = {}\n",
    "for dataset, model in log_progress(dataset_models):\n",
    "    preds = dataset_models[dataset, model]\n",
    "    targets = datasets[dataset]\n",
    "    scores[dataset, model] = score_markups(preds, targets)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores_table = scores_report_table(scores, DATASETS, MODELS)\n",
    "html = format_scores_report(scores_table)\n",
    "patch_readme(SYNTAX1, html, README)\n",
    "patch_readme(SYNTAX1, html, SLOVNET_README)\n",
    "HTML(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BENCH = [\n",
    "    Bench(\n",
    "        UDPIPE,\n",
    "        init=6.91,\n",
    "        disk=45 * MB,\n",
    "        ram=242 * MB,\n",
    "        speed=56.2,\n",
    "    ),\n",
    "    Bench(\n",
    "        SPACY,\n",
    "        init=9,\n",
    "        disk=140 * MB,\n",
    "        ram=579 * MB,\n",
    "        speed=41,\n",
    "    ),\n",
    "    Bench(\n",
    "        DEEPPAVLOV_BERT,\n",
    "        init=34,\n",
    "        disk=(706 + 721) * MB,  # BERT + model\n",
    "        ram=8.5 * GB,\n",
    "        speed=75,\n",
    "        device=GPU\n",
    "    ),\n",
    "    Bench(\n",
    "        SLOVNET_BERT,\n",
    "        init=5,\n",
    "        disk=504 * MB,\n",
    "        ram=3427 * MB,\n",
    "        speed=200,\n",
    "        device=GPU\n",
    "    ),\n",
    "    Bench(\n",
    "        SLOVNET,\n",
    "        init=1,\n",
    "        disk=27 * MB,\n",
    "        ram=125 * MB,\n",
    "        speed=450,\n",
    "    ),\n",
    "    Bench(\n",
    "        STANZA,\n",
    "        init=3,\n",
    "        disk=591 * MB,\n",
    "        ram=890 * MB,\n",
    "        speed=12,\n",
    "    ),\n",
    "]\n",
    "\n",
    "bench_table = bench_report_table(BENCH, MODELS)\n",
    "html = format_bench_report(bench_table)\n",
    "patch_readme(SYNTAX2, html, README)\n",
    "patch_readme(SYNTAX2, html, SLOVNET_README)\n",
    "HTML(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
